<?xml version="1.0" encoding="ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <title>pypln.pypln.apps.extractor</title>
  <link rel="stylesheet" href="epydoc.css" type="text/css" />
  <script type="text/javascript" src="epydoc.js"></script>
</head>

<body bgcolor="white" text="black" link="blue" vlink="#204080"
      alink="#204080">
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="pypln.pypln-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="http://code.google.com/p/pypln">Project Homepage</a></th>
          </tr></table></th>
  </tr>
</table>
<table width="100%" cellpadding="0" cellspacing="0">
  <tr valign="top">
    <td width="100%">
      <span class="breadcrumbs">
        Package&nbsp;pypln ::
        <a href="pypln.pypln-module.html">Package&nbsp;pypln</a> ::
        <a href="pypln.pypln.apps-module.html">Package&nbsp;apps</a> ::
        Module&nbsp;extractor
      </span>
    </td>
    <td>
      <table cellpadding="0" cellspacing="0">
        <!-- hide/show private -->
        <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
    onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
      </table>
    </td>
  </tr>
</table>
<h1 class="epydoc">Source Code for <a href="pypln.pypln.apps.extractor-module.html">Module pypln.pypln.apps.extractor</a></h1>
<pre class="py-src">
<a name="L1"></a><tt class="py-lineno"> 1</tt>  <tt class="py-line"><tt class="py-comment">#!/usr/bin env python</tt> </tt>
<a name="L2"></a><tt class="py-lineno"> 2</tt>  <tt class="py-line"><tt class="py-comment">#-*- coding:utf-8 -*-</tt> </tt>
<a name="L3"></a><tt class="py-lineno"> 3</tt>  <tt class="py-line"><tt class="py-docstring">"""</tt> </tt>
<a name="L4"></a><tt class="py-lineno"> 4</tt>  <tt class="py-line"><tt class="py-docstring">New Document processor</tt> </tt>
<a name="L5"></a><tt class="py-lineno"> 5</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L6"></a><tt class="py-lineno"> 6</tt>  <tt class="py-line"><tt class="py-docstring">This apps scans a directory and submits new/changed documents to the preprocessing pipeline</tt> </tt>
<a name="L7"></a><tt class="py-lineno"> 7</tt>  <tt class="py-line"><tt class="py-docstring">Which will perform, in order:</tt> </tt>
<a name="L8"></a><tt class="py-lineno"> 8</tt>  <tt class="py-line"><tt class="py-docstring">* mimetype detection</tt> </tt>
<a name="L9"></a><tt class="py-lineno"> 9</tt>  <tt class="py-line"><tt class="py-docstring">* encoding detection</tt> </tt>
<a name="L10"></a><tt class="py-lineno">10</tt>  <tt class="py-line"><tt class="py-docstring">* text extraction and conversion if necessary</tt> </tt>
<a name="L11"></a><tt class="py-lineno">11</tt>  <tt class="py-line"><tt class="py-docstring">* Storage of extracted text in database</tt> </tt>
<a name="L12"></a><tt class="py-lineno">12</tt>  <tt class="py-line"><tt class="py-docstring">"""</tt> </tt>
<a name="L13"></a><tt class="py-lineno">13</tt>  <tt class="py-line"> </tt>
<a name="L14"></a><tt class="py-lineno">14</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">subprocess</tt> </tt>
<a name="L15"></a><tt class="py-lineno">15</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">os</tt> </tt>
<a name="L16"></a><tt class="py-lineno">16</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">sys</tt> </tt>
<a name="L17"></a><tt class="py-lineno">17</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">collections</tt> <tt class="py-keyword">import</tt> <tt class="py-name">defaultdict</tt> </tt>
<a name="L18"></a><tt class="py-lineno">18</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">mimetypes</tt> </tt>
<a name="L19"></a><tt class="py-lineno">19</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">glob</tt> <tt class="py-keyword">import</tt> <tt class="py-name">glob</tt> </tt>
<a name="L20"></a><tt class="py-lineno">20</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-0" class="py-name" targets="Package pypln.pypln=pypln.pypln-module.html"><a title="pypln.pypln" class="py-name" href="#" onclick="return doclink('link-0', 'pypln', 'link-0');">pypln</a></tt><tt class="py-op">.</tt><tt id="link-1" class="py-name" targets="Package pypln.pypln.servers=pypln.pypln.servers-module.html"><a title="pypln.pypln.servers" class="py-name" href="#" onclick="return doclink('link-1', 'servers', 'link-1');">servers</a></tt><tt class="py-op">.</tt><tt id="link-2" class="py-name" targets="Module pypln.pypln.servers.ventilator=pypln.pypln.servers.ventilator-module.html"><a title="pypln.pypln.servers.ventilator" class="py-name" href="#" onclick="return doclink('link-2', 'ventilator', 'link-2');">ventilator</a></tt> <tt class="py-keyword">import</tt> <tt id="link-3" class="py-name" targets="Class pypln.pypln.servers.ventilator.Ventilator=pypln.pypln.servers.ventilator.Ventilator-class.html"><a title="pypln.pypln.servers.ventilator.Ventilator" class="py-name" href="#" onclick="return doclink('link-3', 'Ventilator', 'link-3');">Ventilator</a></tt> </tt>
<a name="L21"></a><tt class="py-lineno">21</tt>  <tt class="py-line"> </tt>
<a name="get_md5_sum"></a><div id="get_md5_sum-def"><a name="L22"></a><tt class="py-lineno">22</tt> <a class="py-toggle" href="#" id="get_md5_sum-toggle" onclick="return toggle('get_md5_sum');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pypln.pypln.apps.extractor-module.html#get_md5_sum">get_md5_sum</a><tt class="py-op">(</tt><tt class="py-param">file</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="get_md5_sum-collapsed" style="display:none;" pad="++" indent="++++"></div><div id="get_md5_sum-expanded"><a name="L23"></a><tt class="py-lineno">23</tt>  <tt class="py-line">    <tt class="py-name">p</tt> <tt class="py-op">=</tt> <tt class="py-name">subprocess</tt><tt class="py-op">.</tt><tt class="py-name">Popen</tt><tt class="py-op">(</tt><tt class="py-op">[</tt><tt class="py-string">'md5sum'</tt><tt class="py-op">,</tt> <tt class="py-name">file</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">stdout</tt><tt class="py-op">=</tt><tt class="py-name">subprocess</tt><tt class="py-op">.</tt><tt class="py-name">PIPE</tt><tt class="py-op">)</tt> </tt>
<a name="L24"></a><tt class="py-lineno">24</tt>  <tt class="py-line">    <tt class="py-name">sto</tt><tt class="py-op">,</tt> <tt class="py-name">ste</tt> <tt class="py-op">=</tt> <tt class="py-name">p</tt><tt class="py-op">.</tt><tt class="py-name">communicate</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt> </tt>
<a name="L25"></a><tt class="py-lineno">25</tt>  <tt class="py-line">    <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">ste</tt><tt class="py-op">:</tt> </tt>
<a name="L26"></a><tt class="py-lineno">26</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">sto</tt> </tt>
</div><a name="L27"></a><tt class="py-lineno">27</tt>  <tt class="py-line"> </tt>
<a name="scan_dir"></a><div id="scan_dir-def"><a name="L28"></a><tt class="py-lineno">28</tt> <a class="py-toggle" href="#" id="scan_dir-toggle" onclick="return toggle('scan_dir');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pypln.pypln.apps.extractor-module.html#scan_dir">scan_dir</a><tt class="py-op">(</tt><tt class="py-param">path</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="scan_dir-collapsed" style="display:none;" pad="++" indent="++++"></div><div id="scan_dir-expanded"><a name="L29"></a><tt class="py-lineno">29</tt>  <tt class="py-line">    <tt class="py-name">filelist</tt> <tt class="py-op">=</tt> <tt class="py-name">glob</tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">+</tt><tt class="py-string">'*'</tt><tt class="py-op">)</tt> </tt>
<a name="L30"></a><tt class="py-lineno">30</tt>  <tt class="py-line">    <tt class="py-name">docdict</tt> <tt class="py-op">=</tt> <tt class="py-name">defaultdict</tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt><tt class="py-op">:</tt><tt class="py-op">[</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
<a name="L31"></a><tt class="py-lineno">31</tt>  <tt class="py-line">    <tt class="py-keyword">for</tt> <tt class="py-name">f</tt> <tt class="py-keyword">in</tt> <tt class="py-name">filelist</tt><tt class="py-op">:</tt> </tt>
<a name="L32"></a><tt class="py-lineno">32</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">os</tt><tt class="py-op">.</tt><tt class="py-name">isfile</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L33"></a><tt class="py-lineno">33</tt>  <tt class="py-line">            <tt class="py-name">mt</tt> <tt class="py-op">=</tt> <tt class="py-name">mimetypes</tt><tt class="py-op">.</tt><tt class="py-name">guess_type</tt><tt class="py-op">(</tt><tt class="py-name">msg</tt><tt class="py-op">)</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt> </tt>
<a name="L34"></a><tt class="py-lineno">34</tt>  <tt class="py-line">            <tt class="py-comment">#classify documents by mimetype</tt> </tt>
<a name="L35"></a><tt class="py-lineno">35</tt>  <tt class="py-line">            <tt class="py-name">fullpath</tt> <tt class="py-op">=</tt> <tt class="py-name">os</tt><tt class="py-op">.</tt><tt class="py-name">path</tt><tt class="py-op">.</tt><tt class="py-name">join</tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">,</tt> <tt class="py-name">f</tt><tt class="py-op">)</tt> </tt>
<a name="L36"></a><tt class="py-lineno">36</tt>  <tt class="py-line">            <tt class="py-name">md5s</tt> <tt class="py-op">=</tt> <tt id="link-4" class="py-name" targets="Function pypln.pypln.apps.extractor.get_md5_sum()=pypln.pypln.apps.extractor-module.html#get_md5_sum"><a title="pypln.pypln.apps.extractor.get_md5_sum" class="py-name" href="#" onclick="return doclink('link-4', 'get_md5_sum', 'link-4');">get_md5_sum</a></tt><tt class="py-op">(</tt><tt class="py-name">fullpath</tt><tt class="py-op">)</tt> </tt>
<a name="L37"></a><tt class="py-lineno">37</tt>  <tt class="py-line">            <tt class="py-name">docdict</tt><tt class="py-op">[</tt><tt class="py-name">mt</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-op">(</tt><tt class="py-name">fullpath</tt><tt class="py-op">,</tt> <tt class="py-name">md5s</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L38"></a><tt class="py-lineno">38</tt>  <tt class="py-line">    <tt class="py-keyword">return</tt> <tt class="py-name">docdict</tt> </tt>
</div><a name="L39"></a><tt class="py-lineno">39</tt>  <tt class="py-line">     </tt>
<a name="send_tasks"></a><div id="send_tasks-def"><a name="L40"></a><tt class="py-lineno">40</tt> <a class="py-toggle" href="#" id="send_tasks-toggle" onclick="return toggle('send_tasks');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pypln.pypln.apps.extractor-module.html#send_tasks">send_tasks</a><tt class="py-op">(</tt><tt class="py-param">docs</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="send_tasks-collapsed" style="display:none;" pad="++" indent="++++"></div><div id="send_tasks-expanded"><a name="L41"></a><tt class="py-lineno">41</tt>  <tt class="py-line">    <tt class="py-keyword">pass</tt> </tt>
</div><a name="L42"></a><tt class="py-lineno">42</tt>  <tt class="py-line"><tt class="py-keyword">if</tt> <tt class="py-name">__name__</tt><tt class="py-op">==</tt><tt class="py-string">"__main__"</tt><tt class="py-op">:</tt> </tt>
<a name="L43"></a><tt class="py-lineno">43</tt>  <tt class="py-line">    <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">sys</tt><tt class="py-op">.</tt><tt class="py-name">argv</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt><tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
<a name="L44"></a><tt class="py-lineno">44</tt>  <tt class="py-line">        <tt class="py-name">p</tt> <tt class="py-op">=</tt> <tt class="py-name">sys</tt><tt class="py-op">.</tt><tt class="py-name">argv</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt> </tt>
<a name="L45"></a><tt class="py-lineno">45</tt>  <tt class="py-line">        <tt id="link-5" class="py-name" targets="Function pypln.pypln.apps.extractor.scan_dir()=pypln.pypln.apps.extractor-module.html#scan_dir"><a title="pypln.pypln.apps.extractor.scan_dir" class="py-name" href="#" onclick="return doclink('link-5', 'scan_dir', 'link-5');">scan_dir</a></tt><tt class="py-op">(</tt><tt class="py-name">p</tt><tt class="py-op">)</tt> </tt>
<a name="L46"></a><tt class="py-lineno">46</tt>  <tt class="py-line"> </tt><script type="text/javascript">
<!--
expandto(location.href);
// -->
</script>
</pre>
<br />
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="pypln.pypln-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="http://code.google.com/p/pypln">Project Homepage</a></th>
          </tr></table></th>
  </tr>
</table>
<table border="0" cellpadding="0" cellspacing="0" width="100%%">
  <tr>
    <td align="left" class="footer">
    Generated by Epydoc 3.0.1 on Tue Jul 26 17:47:30 2011
    </td>
    <td align="right" class="footer">
      <a target="mainFrame" href="http://epydoc.sourceforge.net"
        >http://epydoc.sourceforge.net</a>
    </td>
  </tr>
</table>

<script type="text/javascript">
  <!--
  // Private objects are initially displayed (because if
  // javascript is turned off then we want them to be
  // visible); but by default, we want to hide them.  So hide
  // them unless we have a cookie that says to show them.
  checkCookie();
  // -->
</script>
</body>
</html>
